# Convert an instruction-tuning dataset by running ./preprocess_data.py.
#
# Required environment variables:
#   HF_DATA_PATH   - passed to preprocess_data.py as --input
#   HF_MODEL_PATH  - passed to preprocess_data.py as --tokenizer-name-or-path
# Exported by this script:
#   MG_DATA_PATH   - HF_DATA_PATH with the first occurrence of "hf_data"
#                    replaced by "mg_data"; used as --output-prefix

# Adjust the set_env.sh path below to match your actual environment.
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Fail early with a clear message instead of handing empty arguments to
# mkdir/python when a required variable is unset or empty.
: "${HF_DATA_PATH:?HF_DATA_PATH must be set to the input dataset path}"
: "${HF_MODEL_PATH:?HF_MODEL_PATH must be set to the tokenizer/model path}"

# NOTE: if HF_DATA_PATH does not contain "hf_data", the substitution is a
# no-op and MG_DATA_PATH ends up identical to HF_DATA_PATH.
export MG_DATA_PATH="${HF_DATA_PATH/hf_data/mg_data}"
mkdir -p -- "${MG_DATA_PATH}"

# All expansions are quoted so paths with spaces or glob characters are
# passed through as single, literal arguments.
python ./preprocess_data.py \
	--input "${HF_DATA_PATH}" \
	--tokenizer-name-or-path "${HF_MODEL_PATH}" \
	--output-prefix "${MG_DATA_PATH}" \
	--handler-name AlpacaStyleInstructionHandler \
	--tokenizer-type PretrainedFromHF \
	--workers 16 \
	--log-interval 1000 \
	--prompt-type qwen
  # --map-keys '{"prompt":"instruction","query":"input","response":"output"}' # default value; may be omitted